Introduction to R

1 Set Up

Use install.packages("package_name") to install packages from CRAN and library(package_name) to load packages for use in your current session.

# Installing a package from CRAN (one-time step, kept commented out here)
# install.packages("tidyverse")

# Loading the package for use in the current session
# library(tidyverse)
# pacman::p_load() is called in place of library(); per the pacman
# documentation it attaches the package and installs it first if it is missing.
# NOTE(review): pacman itself must already be installed for this line to work.
pacman::p_load(tidyverse)

2 Data Exploration

  • Dataset Structure
# Compact overview of the dataset: dimensions, column types, first values
glimpse(diamonds)
## Rows: 53,940
## Columns: 10
## $ carat   <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut     <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
## $ color   <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
## $ depth   <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table   <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price   <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x       <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y       <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z       <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…
  • Top Rows in the Dataset
# First six rows of the dataset (head() defaults to n = 6)
head(diamonds)
  • Distribution of Categorical Variables
# Number of diamonds in each cut category
count(diamonds, cut)
  • Dataset Summary Statistics
# Per-column summary: quantiles and mean for numeric columns,
# level counts for the ordered factors
summary(diamonds)
##      carat               cut        color        clarity          depth      
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065   Min.   :43.00  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258   1st Qu.:61.00  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194   Median :61.80  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171   Mean   :61.75  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066   3rd Qu.:62.50  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655   Max.   :79.00  
##                                     J: 2808   (Other): 2531                  
##      table           price             x                y         
##  Min.   :43.00   Min.   :  326   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710   1st Qu.: 4.720  
##  Median :57.00   Median : 2401   Median : 5.700   Median : 5.710  
##  Mean   :57.46   Mean   : 3933   Mean   : 5.731   Mean   : 5.735  
##  3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540   3rd Qu.: 6.540  
##  Max.   :95.00   Max.   :18823   Max.   :10.740   Max.   :58.900  
##                                                                   
##        z         
##  Min.   : 0.000  
##  1st Qu.: 2.910  
##  Median : 3.530  
##  Mean   : 3.539  
##  3rd Qu.: 4.040  
##  Max.   :31.800  
## 

3 Data Visualization

  • Most tidyverse functions take a dataset as their first argument, which is why each example below can pipe the data into the next function with %>%

3.1 Scatter Plot

# Example of a scatter plot: price against carat with a linear trend line
diamonds %>%                                     # the data
  sample_n(size = 1000) %>%                      # sample 1000 rows
  ggplot(aes(x = carat, y = price)) +            # aesthetic mapping (columns)
  # A fixed colour is a constant, so it is set on the layers rather than inside
  # aes(). Inside aes() the string is treated as data, which draws the default
  # palette colour (not lightsalmon) and creates a one-entry legend.
  geom_point(color = "lightsalmon", alpha = 0.5) +     # scatter points
  geom_smooth(method = "lm", color = "lightsalmon") +  # linear fit line
  labs(
    title = "Relationship between Price and Carat in the Diamonds Dataset"
  ) +
  # Nothing is mapped to colour any more, so no legend is produced. The earlier
  # theme(legend.position = "none") had no effect anyway: theme_bw() is a
  # complete theme and reset it because it was added afterwards.
  theme_bw()

3.2 Box Plot

# Example of a box plot: price distribution within each cut category
diamonds %>%                                       # the data
  ggplot(aes(x = cut, y = price, fill = cut)) +    # aesthetic mapping
  # The fill legend would only repeat the x-axis labels, so it is hidden.
  # FALSE is spelled out because F is an ordinary variable that can be
  # reassigned.
  geom_boxplot(show.legend = FALSE) +              # geom layer
  labs(title = "Distribution of Price by Cut in the Diamonds Dataset") +
  theme_bw()                                       # theme for the plot

3.3 Histogram

# Example of a histogram: distribution of price across 30 bins
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(bins = 30, fill = "steelblue") +
  labs(title = "Distribution of Price in the Diamonds Dataset") +
  theme_bw()

3.4 Line Graphs for Time Series Data

# Plotting time series data
library(timetk)

m4_daily %>%                     # the data set
  filter(id == "D10") %>%        # keep a single series
  plot_time_series(              # two key arguments: date and value columns
    .date_var = date,
    .value = value,
    .x_lab = "Date",
    .y_lab = "Value",
    .interactive = TRUE          # TRUE spelled out; T can be reassigned
  )

3.5 Anomaly detection

# Checking for anomalies in the data
m4_daily %>%                      # the data set
  filter(id == "D10") %>%         # keep a single series
  plot_anomaly_diagnostics(       # two key arguments: date and value columns
    .date_var = date,
    .value = value,
    .x_lab = "Date",
    .y_lab = "Value",
    .interactive = TRUE           # TRUE spelled out; T can be reassigned
  )

3.6 Visualizing Seasonality features in time series data

# Visualizing seasonality features of a single series
m4_daily %>%                    # the data set
  filter(id == "D10") %>%       # keep a single series
  plot_seasonal_diagnostics(    # two key arguments: date and value columns
    .date_var = date,
    .value = value,
    .x_lab = "Date",
    .y_lab = "Value",
    .interactive = TRUE         # TRUE spelled out; T can be reassigned
  )